import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors
import pandas as pd 
import random
import math
import time
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
import datetime
import operator 
plt.style.use('fivethirtyeight')
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Load the DataFrames from their sources.

import urllib.error

# Johns Hopkins CSSE time series: one row per region, one column per day.
confirmed_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
deaths_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
recoveries_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')

# Daily report snapshots; both are pinned to the 10-07-2020 files.
latest_data = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/10-07-2020.csv')
us_medical_data = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/10-07-2020.csv')

# The Apple mobility file at this URL has been observed to return HTTP 404.
# Report the failure instead of aborting the cell; apple_mobility is None
# when the download is unavailable.
try:
    apple_mobility = pd.read_csv('https://covid19-static.cdn-apple.com/covid19-mobility-data/2018HotfixDev19/v3/en-us/applemobilitytrends-2020-10-07.csv')
except urllib.error.HTTPError as err:
    print('Apple mobility data unavailable ({}); continuing without it.'.format(err))
    apple_mobility = None
---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
<ipython-input-3-e0928034943f> in <module>
      4 latest_data = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/10-07-2020.csv')
      5 us_medical_data = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/10-07-2020.csv')
----> 6 apple_mobility = pd.read_csv('https://covid19-static.cdn-apple.com/covid19-mobility-data/2018HotfixDev19/v3/en-us/applemobilitytrends-2020-10-07.csv')

~\anaconda3\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    674         )
    675 
--> 676         return _read(filepath_or_buffer, kwds)
    677 
    678     parser_f.__name__ = name

~\anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
    428     # though mypy handling of conditional imports is difficult.
    429     # See https://github.com/python/mypy/issues/1297
--> 430     fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
    431         filepath_or_buffer, encoding, compression
    432     )

~\anaconda3\lib\site-packages\pandas\io\common.py in get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode)
    170 
    171     if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
--> 172         req = urlopen(filepath_or_buffer)
    173         content_encoding = req.headers.get("Content-Encoding", None)
    174         if content_encoding == "gzip":

~\anaconda3\lib\site-packages\pandas\io\common.py in urlopen(*args, **kwargs)
    139     import urllib.request
    140 
--> 141     return urllib.request.urlopen(*args, **kwargs)
    142 
    143 

~\anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    532 
    533         return response

~\anaconda3\lib\urllib\request.py in http_response(self, request, response)
    638         # request was successfully received, understood, and accepted.
    639         if not (200 <= code < 300):
--> 640             response = self.parent.error(
    641                 'http', request, response, code, msg, hdrs)
    642 

~\anaconda3\lib\urllib\request.py in error(self, proto, *args)
    567         if http_err:
    568             args = (dict, 'default', 'http_error_default') + orig_args
--> 569             return self._call_chain(*args)
    570 
    571 # XXX probably also want an abstract factory that knows when it makes

~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    500         for handler in handlers:
    501             func = getattr(handler, meth_name)
--> 502             result = func(*args)
    503             if result is not None:
    504                 return result

~\anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
    647 class HTTPDefaultErrorHandler(BaseHandler):
    648     def http_error_default(self, req, fp, code, msg, hdrs):
--> 649         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    650 
    651 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 404: Not Found
# Preview the first rows of the confirmed-cases time series.
confirmed_df.head()
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 11/3/20 11/4/20 11/5/20 11/6/20 11/7/20 11/8/20 11/9/20 11/10/20 11/11/20 11/12/20
0 NaN Afghanistan 33.93911 67.709953 0 0 0 0 0 0 ... 41728 41814 41935 41975 42033 42092 42297 42463 42609 42795
1 NaN Albania 41.15330 20.168300 0 0 0 0 0 0 ... 21904 22300 22721 23210 23705 24206 24731 25294 25801 26211
2 NaN Algeria 28.03390 1.659600 0 0 0 0 0 0 ... 58979 59527 60169 60800 61381 62051 62693 63446 64257 65108
3 NaN Andorra 42.50630 1.521800 0 0 0 0 0 0 ... 4910 5045 5135 5135 5319 5383 5437 5477 5567 5616
4 NaN Angola -11.20270 17.873900 0 0 0 0 0 0 ... 11577 11813 12102 12223 12335 12433 12680 12816 12953 13053

5 rows × 300 columns

# Preview the first rows of the pinned daily report.
latest_data.head()
FIPS Admin2 Province_State Country_Region Last_Update Lat Long_ Confirmed Deaths Recovered Active Combined_Key Incidence_Rate Case-Fatality_Ratio
0 NaN NaN NaN Afghanistan 2020-10-08 04:23:56 33.93911 67.709953 39548 1469 33045 5034.0 Afghanistan 101.591794 3.714474
1 NaN NaN NaN Albania 2020-10-08 04:23:56 41.15330 20.168300 14730 407 9115 5208.0 Albania 511.849329 2.763069
2 NaN NaN NaN Algeria 2020-10-08 04:23:56 28.03390 1.659600 52520 1771 36857 13892.0 Algeria 119.769101 3.372049
3 NaN NaN NaN Andorra 2020-10-08 04:23:56 42.50630 1.521800 2568 53 1715 800.0 Andorra 3323.626480 2.063863
4 NaN NaN NaN Angola 2020-10-08 04:23:56 -11.20270 17.873900 5725 211 2598 2916.0 Angola 17.419075 3.685590
# Preview the first rows of the pinned US daily report.
us_medical_data.head()
Province_State Country_Region Last_Update Lat Long_ Confirmed Deaths Recovered Active FIPS Incident_Rate People_Tested People_Hospitalized Mortality_Rate UID ISO3 Testing_Rate Hospitalization_Rate
0 Alabama US 2020-10-08 04:30:35 32.3182 -86.9023 161418 2601 67948.0 90869.0 1.0 3292.105030 1180818.0 NaN 1.611344 84000001 USA 24082.672793 NaN
1 Alaska US 2020-10-08 04:30:35 61.3707 -152.4044 8878 59 5626.0 3193.0 2.0 1213.595883 490074.0 NaN 0.664564 84000002 USA 66991.640979 NaN
2 American Samoa US 2020-10-08 04:30:35 -14.2710 -170.1320 0 0 NaN 0.0 60.0 0.000000 1616.0 NaN NaN 16 ASM 2904.333136 NaN
3 Arizona US 2020-10-08 04:30:35 33.7298 -111.4312 222538 5733 36336.0 180469.0 4.0 3057.379480 1518694.0 NaN 2.576189 84000004 USA 20864.858463 NaN
4 Arkansas US 2020-10-08 04:30:35 34.9697 -92.3731 88880 1482 80703.0 6695.0 5.0 2945.187958 1087671.0 NaN 1.667417 84000005 USA 36041.803908 NaN
# The first four columns are Province/State, Country/Region, Lat and Long;
# everything from index 4 onwards is one column per day.
cols = confirmed_df.keys()
first_day, last_day = cols[4], cols[-1]
confirmed = confirmed_df.loc[:, first_day:last_day]
deaths = deaths_df.loc[:, first_day:last_day]
recoveries = recoveries_df.loc[:, first_day:last_day]
dates = confirmed.keys()

# Worldwide totals, one entry per day.
world_cases, total_deaths, total_recovered, total_active = [], [], [], []
# Worldwide ratios relative to confirmed cases, one entry per day.
mortality_rate, recovery_rate = [], []

for day in dates:
    day_confirmed = confirmed[day].sum()
    day_deaths = deaths[day].sum()
    day_recovered = recoveries[day].sum()

    world_cases.append(day_confirmed)
    total_deaths.append(day_deaths)
    total_recovered.append(day_recovered)
    # Active = confirmed cases that have neither died nor recovered.
    total_active.append(day_confirmed - day_deaths - day_recovered)

    mortality_rate.append(day_deaths / day_confirmed)
    recovery_rate.append(day_recovered / day_confirmed)

Getting daily increase and moving averages.

def daily_increase(data):
    """Return the day-over-day change of a cumulative series.

    The first entry is the first value of ``data`` itself; every later entry
    is the difference between that value and the one before it.
    """
    return [
        value if position == 0 else value - data[position - 1]
        for position, value in enumerate(data)
    ]

def moving_average(data, window_size):
    """Return a forward-looking moving average of ``data``.

    Entry ``i`` is the mean of ``data[i:i + window_size]``. Near the end of
    the series the slice is simply cut off at the last element, so the final
    entries average fewer than ``window_size`` values.
    """
    # Slicing past the end of the sequence truncates, which covers the tail.
    return [
        np.mean(data[start:start + window_size])
        for start in range(len(data))
    ]

# Number of days averaged by moving_average() for every smoothed series.
window = 7

# Confirmed cases: smoothed cumulative total, daily change, smoothed change.
world_confirmed_avg = moving_average(world_cases, window)
world_daily_increase = daily_increase(world_cases)
world_daily_increase_avg = moving_average(world_daily_increase, window)

# Deaths: same three series.
world_death_avg = moving_average(total_deaths, window)
world_daily_death = daily_increase(total_deaths)
world_daily_death_avg = moving_average(world_daily_death, window)

# Recoveries: same three series.
world_recovery_avg = moving_average(total_recovered, window)
world_daily_recovery = daily_increase(total_recovered)
world_daily_recovery_avg = moving_average(world_daily_recovery, window)

# Active cases: only the smoothed cumulative total is needed.
world_active_avg = moving_average(total_active, window)

# Column vectors (n, 1): day index and the cumulative totals.
days_since_1_22 = np.array(range(len(dates))).reshape(-1, 1)
world_cases = np.array(world_cases).reshape(-1, 1)
total_deaths = np.array(total_deaths).reshape(-1, 1)
total_recovered = np.array(total_recovered).reshape(-1, 1)

Future Forecasting

# How many days past the last observed date the models should predict.
days_in_future = 10
# Day indices covering every observed day plus the forecast horizon, as a
# column vector (n, 1).
future_forcast = np.array([i for i in range(len(dates)+days_in_future)]).reshape(-1, 1)
# Day indices of the observed period only. Sliced by len(dates) rather than a
# hard-coded 10 so it stays correct when days_in_future changes.
adjusted_dates = future_forcast[:len(dates)]

Converting integer day offsets into datetime strings for better visualization

# Day 0 of the time series.
start = '1/22/2020'
start_date = datetime.datetime.strptime(start, '%m/%d/%Y')
# One 'MM/DD/YYYY' label per entry of future_forcast, counting up from start.
future_forcast_dates = [
    (start_date + datetime.timedelta(days=offset)).strftime('%m/%d/%Y')
    for offset in range(len(future_forcast))
]

Slightly modify the data to fit the models better: skip the first 50 days, because the regression models cannot pick up the pattern otherwise.

# Drop the first 50 days, then hold out the last 5% of the remaining days as
# the test set. shuffle=False keeps the split chronological.
(
    X_train_confirmed,
    X_test_confirmed,
    y_train_confirmed,
    y_test_confirmed,
) = train_test_split(
    days_since_1_22[50:],
    world_cases[50:],
    test_size=0.05,
    shuffle=False,
)

Models for predicting the number of confirmed cases. I am using a support vector machine, Bayesian ridge regression, and linear regression.

# Disabled randomized hyper-parameter search for the polynomial-kernel SVR,
# kept for reference. The fixed values used below all appear in this grid.
# c = [0.01, 0.1, 1]
# gamma = [0.01, 0.1, 1]
# epsilon = [0.01, 0.1, 1]
# shrinking = [True, False]
# degree = [3, 4, 5]

# svm_grid = {'C': c, 'gamma' : gamma, 'epsilon': epsilon, 'shrinking' : shrinking, 'degree': degree}

# svm = SVR(kernel='poly')
# svm_search = RandomizedSearchCV(svm, svm_grid, scoring='neg_mean_squared_error', cv=3, return_train_score=True, n_jobs=-1, n_iter=30, verbose=1)
# svm_search.fit(X_train_confirmed, y_train_confirmed)
 
# Fit a degree-3 polynomial-kernel SVR on the training days, then predict
# for every day index in future_forcast (observed days plus the horizon).
svm_confirmed = SVR(shrinking=True, kernel='poly',gamma=0.01, epsilon=1,degree=3, C=0.1)
svm_confirmed.fit(X_train_confirmed, y_train_confirmed)
svm_pred = svm_confirmed.predict(future_forcast)

Check against testing data

# Compare the SVM against the held-out test days.
svm_test_pred = svm_confirmed.predict(X_test_confirmed)
plt.plot(y_test_confirmed)
plt.plot(svm_test_pred)
plt.legend(['Test Data', 'SVM Predictions'])
# scikit-learn metrics take (y_true, y_pred); both errors are symmetric, so
# the printed values are unchanged by passing them in the documented order.
print('MAE:', mean_absolute_error(y_test_confirmed, svm_test_pred))
print('MSE:', mean_squared_error(y_test_confirmed, svm_test_pred))
MAE: 2344622.5616368484
MSE: 5523754241193.524

Transform our data for polynomial regression

# Expand the day index into polynomial terms up to degree 5. The transformer
# is fitted on the training days only and then applied with transform() to
# the test days and the forecast range, instead of being re-fitted on each.
poly = PolynomialFeatures(degree=5)
poly_X_train_confirmed = poly.fit_transform(X_train_confirmed)
poly_X_test_confirmed = poly.transform(X_test_confirmed)
poly_future_forcast = poly.transform(future_forcast)

# Separate transformer, same degree, for the Bayesian ridge model.
bayesian_poly = PolynomialFeatures(degree=5)
bayesian_poly_X_train_confirmed = bayesian_poly.fit_transform(X_train_confirmed)
bayesian_poly_X_test_confirmed = bayesian_poly.transform(X_test_confirmed)
bayesian_poly_future_forcast = bayesian_poly.transform(future_forcast)

Polynomial regression

# Ordinary least squares on the polynomial features. No intercept is fitted
# by the model itself. The former normalize=True argument is dropped:
# scikit-learn documents it as ignored when fit_intercept=False, and newer
# releases no longer accept the parameter.
linear_model = LinearRegression(fit_intercept=False)
linear_model.fit(poly_X_train_confirmed, y_train_confirmed)
test_linear_pred = linear_model.predict(poly_X_test_confirmed)
linear_pred = linear_model.predict(poly_future_forcast)
# Metrics take (y_true, y_pred); the values are the same in either order.
print('MAE:', mean_absolute_error(y_test_confirmed, test_linear_pred))
print('MSE:', mean_squared_error(y_test_confirmed, test_linear_pred))
MAE: 1206493.191649382
MSE: 1585980580691.81
# Show the fitted coefficients of the polynomial regression.
print(linear_model.coef_)
[[-1.87018323e+07  7.19991429e+05 -1.01838910e+04  7.06102955e+01
  -2.13579009e-01  2.45409901e-04]]
# Overlay the polynomial regression predictions on the held-out test data.
plt.plot(y_test_confirmed)
plt.plot(test_linear_pred)
plt.legend(['Test Data', 'Polynomial Regression Predictions'])
<matplotlib.legend.Legend at 0x162b9686a30>

Bayesian ridge polynomial regression

# Candidate values for the randomized search over BayesianRidge settings.
tol = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
alpha_1 = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
alpha_2 = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
lambda_1 = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
lambda_2 = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]

# 'normalize' is intentionally not searched: scikit-learn documents it as
# ignored when fit_intercept=False, and newer releases no longer accept it.
bayesian_grid = {
    'tol': tol,
    'alpha_1': alpha_1,
    'alpha_2': alpha_2,
    'lambda_1': lambda_1,
    'lambda_2': lambda_2,
}

bayesian = BayesianRidge(fit_intercept=False)
# 40 random parameter combinations, 3-fold cross-validation each, scored by
# negative mean squared error.
bayesian_search = RandomizedSearchCV(bayesian, bayesian_grid, scoring='neg_mean_squared_error', cv=3, return_train_score=True, n_jobs=-1, n_iter=40, verbose=1)
bayesian_search.fit(bayesian_poly_X_train_confirmed, y_train_confirmed)
Fitting 3 folds for each of 40 candidates, totalling 120 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 105 out of 120 | elapsed:    1.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    1.5s finished
RandomizedSearchCV(cv=3, estimator=BayesianRidge(fit_intercept=False),
                   n_iter=40, n_jobs=-1,
                   param_distributions={'alpha_1': [1e-07, 1e-06, 1e-05, 0.0001,
                                                    0.001],
                                        'alpha_2': [1e-07, 1e-06, 1e-05, 0.0001,
                                                    0.001],
                                        'lambda_1': [1e-07, 1e-06, 1e-05,
                                                     0.0001, 0.001],
                                        'lambda_2': [1e-07, 1e-06, 1e-05,
                                                     0.0001, 0.001],
                                        'normalize': [True, False],
                                        'tol': [1e-06, 1e-05, 0.0001, 0.001,
                                                0.01]},
                   return_train_score=True, scoring='neg_mean_squared_error',
                   verbose=1)
# Show the parameter combination selected by the randomized search.
bayesian_search.best_params_
{'tol': 0.01,
 'normalize': True,
 'lambda_2': 1e-07,
 'lambda_1': 0.001,
 'alpha_2': 1e-07,
 'alpha_1': 1e-05}
# Use the best estimator found by the search for evaluation and forecasting.
bayesian_confirmed = bayesian_search.best_estimator_
test_bayesian_pred = bayesian_confirmed.predict(bayesian_poly_X_test_confirmed)
bayesian_pred = bayesian_confirmed.predict(bayesian_poly_future_forcast)
# Metrics take (y_true, y_pred); the values are the same in either order.
print('MAE:', mean_absolute_error(y_test_confirmed, test_bayesian_pred))
print('MSE:', mean_squared_error(y_test_confirmed, test_bayesian_pred))
MAE: 2316291.1614478836
MSE: 5926142585167.471
# Overlay the Bayesian ridge predictions on the held-out test data.
plt.plot(y_test_confirmed)
plt.plot(test_bayesian_pred)
plt.legend(['Test Data', 'Bayesian Ridge Polynomial Predictions'])
<matplotlib.legend.Legend at 0x162bb75cd00>

Graphing the number of confirmed cases, active cases, deaths, recoveries, mortality rate (CFR), and recovery rate

# Flatten the (n, 1) day-index column into a 1-D array for plotting.
adjusted_dates = adjusted_dates.reshape(1, -1)[0]
plt.figure(figsize=(16, 10))
plt.plot(adjusted_dates, world_cases)
plt.plot(adjusted_dates, world_confirmed_avg, linestyle='dashed', color='orange')
plt.title('# of Coronavirus Cases Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('# of Cases', size=30)
plt.legend(['Worldwide Coronavirus Cases', 'Moving Average {} Days'.format(window)], prop={'size': 20})
plt.xticks(size=20)
plt.yticks(size=20)
# Save before showing: calling savefig after show() wrote an empty figure
# (the notebook output was "<Figure size 432x288 with 0 Axes>").
plt.savefig("corona.png", bbox_inches='tight', dpi=600)
plt.show()
<Figure size 432x288 with 0 Axes>
# One figure each for deaths, recoveries and active cases: the cumulative
# series with its moving average overlaid as a dashed orange line.
for series, smoothed, title, y_label, legend_label in (
    (total_deaths, world_death_avg,
     '# of Coronavirus Deaths Over Time', '# of Cases',
     'Worldwide Coronavirus Deaths'),
    (total_recovered, world_recovery_avg,
     '# of Coronavirus Recoveries Over Time', '# of Cases',
     'Worldwide Coronavirus Recoveries'),
    (total_active, world_active_avg,
     '# of Coronavirus Active Cases Over Time', '# of Active Cases',
     'Worldwide Coronavirus Active Cases'),
):
    plt.figure(figsize=(16, 10))
    plt.plot(adjusted_dates, series)
    plt.plot(adjusted_dates, smoothed, linestyle='dashed', color='orange')
    plt.title(title, size=30)
    plt.xlabel('Days Since 1/22/2020', size=30)
    plt.ylabel(y_label, size=30)
    plt.legend([legend_label, 'Moving Average {} Days'.format(window)], prop={'size': 20})
    plt.xticks(size=20)
    plt.yticks(size=20)
    plt.show()
# Daily new confirmed cases as bars, with the moving average as a dashed line.
# The Figure handle is kept in `f` so it can be saved after plt.show().
f=plt.figure(figsize=(16, 10))
plt.bar(adjusted_dates, world_daily_increase)

plt.plot(adjusted_dates, world_daily_increase_avg, color='orange', linestyle='dashed')
plt.title('World Daily Increases in Confirmed Cases', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('# of Cases', size=30)
plt.legend(['Moving Average {} Days'.format(window), 'World Daily Increase in COVID-19 Cases'], prop={'size': 20})
plt.xticks(size=20)
plt.yticks(size=20)
# NOTE(review): the style is switched after this figure has been built, so it
# presumably affects mainly the figures created afterwards — confirm intent.
plt.style.use('dark_background')
plt.show()

# Saved through the Figure object rather than pyplot's current figure.
f.savefig("corona10.jpg")
# Daily deaths and daily recoveries as bars, each with its moving average
# drawn as a dashed orange line.
for daily_values, daily_avg, title, legend_label in (
    (world_daily_death, world_daily_death_avg,
     'World Daily Increases in Confirmed Deaths',
     'World Daily Increase in COVID-19 Deaths'),
    (world_daily_recovery, world_daily_recovery_avg,
     'World Daily Increases in Confirmed Recoveries',
     'World Daily Increase in COVID-19 Recoveries'),
):
    plt.figure(figsize=(16, 10))
    plt.bar(adjusted_dates, daily_values)
    plt.plot(adjusted_dates, daily_avg, color='orange', linestyle='dashed')
    plt.title(title, size=30)
    plt.xlabel('Days Since 1/22/2020', size=30)
    plt.ylabel('# of Cases', size=30)
    plt.legend(['Moving Average {} Days'.format(window), legend_label], prop={'size': 20})
    plt.xticks(size=20)
    plt.yticks(size=20)
    plt.show()
# Base-10 logarithm of the cumulative cases, deaths and recoveries.
for cumulative, title in (
    (world_cases, 'Log of # of Coronavirus Cases Over Time'),
    (total_deaths, 'Log of # of Coronavirus Deaths Over Time'),
    (total_recovered, 'Log of # of Coronavirus Recoveries Over Time'),
):
    plt.figure(figsize=(16, 10))
    plt.plot(adjusted_dates, np.log10(cumulative))
    plt.title(title, size=30)
    plt.xlabel('Days Since 1/22/2020', size=30)
    plt.ylabel('# of Cases', size=30)
    plt.xticks(size=20)
    plt.yticks(size=20)
    plt.show()
def country_plot(x, y1, y2, y3, y4, country):
    """Draw four figures for one country.

    x       -- values for the horizontal axis
    y1      -- cumulative confirmed cases (line plot)
    y2      -- daily increase in confirmed cases (bar plot)
    y3      -- daily increase in deaths (bar plot)
    y4      -- daily increase in recoveries (bar plot)
    country -- name inserted into the titles and legends

    Every figure also shows the moving average of its series as a dashed red
    line. The averaging span is the module-level ``window``.
    """
    # All averages are computed up front, before any figure is created.
    confirmed_avg = moving_average(y1, window)
    confirmed_increase_avg = moving_average(y2, window)
    death_increase_avg = moving_average(y3, window)
    recovery_increase_avg = moving_average(y4, window)

    def finish_figure(legend_labels, title):
        # Shared legend, title, axis labels and tick sizes, then display.
        plt.legend(legend_labels, prop={'size': 20})
        plt.title(title, size=30)
        plt.xlabel('Days Since 1/22/2020', size=30)
        plt.ylabel('# of Cases', size=30)
        plt.xticks(size=20)
        plt.yticks(size=20)
        plt.show()

    average_label = 'Moving Average {} Days'.format(window)

    # Figure 1: cumulative confirmed cases.
    plt.figure(figsize=(16, 10))
    plt.plot(x, y1)
    plt.plot(x, confirmed_avg, color='red', linestyle='dashed')
    finish_figure(
        ['{} Confirmed Cases'.format(country), average_label],
        '{} Confirmed Cases'.format(country),
    )

    # Figures 2-4: daily increases as bars with the average on top.
    daily_panels = (
        (y2, confirmed_increase_avg,
         '{} Daily Increase in Confirmed Cases',
         '{} Daily Increases in Confirmed Cases'),
        (y3, death_increase_avg,
         '{} Daily Increase in Confirmed Deaths',
         '{} Daily Increases in Deaths'),
        (y4, recovery_increase_avg,
         '{} Daily Increase in Confirmed Recoveries',
         '{} Daily Increases in Recoveries'),
    )
    for daily_values, daily_avg, legend_template, title_template in daily_panels:
        plt.figure(figsize=(16, 10))
        plt.bar(x, daily_values)
        plt.plot(x, daily_avg, color='red', linestyle='dashed')
        finish_figure(
            [average_label, legend_template.format(country)],
            title_template.format(country),
        )
      
def get_country_info(country_name):
    """Return per-day totals for one country.

    Looks up ``country_name`` in the 'Country/Region' column of the
    module-level ``confirmed_df``, ``deaths_df`` and ``recoveries_df`` and
    sums the matching rows for every column named in ``dates``.

    Returns a tuple ``(country_cases, country_deaths, country_recoveries)``
    of three lists, each with one entry per date. A name with no matching
    rows yields sums of 0.
    """
    # Select each country's rows once instead of once per date.
    confirmed_rows = confirmed_df[confirmed_df['Country/Region'] == country_name]
    deaths_rows = deaths_df[deaths_df['Country/Region'] == country_name]
    recovered_rows = recoveries_df[recoveries_df['Country/Region'] == country_name]

    country_cases = [confirmed_rows[day].sum() for day in dates]
    country_deaths = [deaths_rows[day].sum() for day in dates]
    country_recoveries = [recovered_rows[day].sum() for day in dates]
    return (country_cases, country_deaths, country_recoveries)
    
    
def country_visualizations(country_name):
    """Plot the cumulative and daily figures for ``country_name``.

    Fetches the country's per-day totals, derives the daily increases, and
    passes everything to ``country_plot`` with ``adjusted_dates`` as the
    horizontal axis.
    """
    cases, death_totals, recovery_totals = get_country_info(country_name)

    country_plot(
        adjusted_dates,
        cases,
        daily_increase(cases),
        daily_increase(death_totals),
        daily_increase(recovery_totals),
        country_name,
    )
    

Country Specific Graphs.

# Countries that get their own set of figures, in display order.
countries = [
    'India', 'US', 'Russia', 'Brazil', 'South Africa', 'China', 'Italy',
    'Germany', 'Spain', 'France', 'United Kingdom', 'Peru', 'Mexico',
    'Colombia', 'Saudi Arabia', 'Iran', 'Bangladesh', 'Pakistan', 'Turkey',
    'Philippines', 'Iraq', 'Indonesia', 'Israel', 'Ukraine', 'Ecuador',
    'Bolivia', 'Netherlands',
]

for country in countries:
    country_visualizations(country)

Country Comparison

compare_countries = ['India', 'US', 'Brazil', 'Russia', 'South Africa'] 
graph_name = ['Coronavirus Confirmed Cases', 'Coronavirus Confirmed Deaths', 'Coronavirus Confirmed Recoveries']

# Fetch each country's (cases, deaths, recoveries) once; previously the same
# lookup was repeated for every one of the three figures.
compare_series = [get_country_info(country) for country in compare_countries]

# One figure per metric, with one line per country.
for num in range(3):
    plt.figure(figsize=(16, 10))
    for series in compare_series:
        plt.plot(series[num])
    plt.legend(compare_countries, prop={'size': 20})
    # The series cover every column in `dates`, which starts on 1/22/20, so
    # the axis counts days from that date (the label used to say 3/1).
    plt.xlabel('Days Since 1/22/2020', size=30)
    plt.ylabel('# of Cases', size=30)
    plt.title(graph_name[num], size=30)
    plt.xticks(size=20)
    plt.yticks(size=20)
    plt.show()
def plot_predictions(x, y, pred, algo_name, color):
    """Plot observed confirmed cases together with one model's predictions.

    x, y      -- observed series (horizontal and vertical values)
    pred      -- predictions, drawn against the module-level ``future_forcast``
    algo_name -- legend label for the prediction line
    color     -- colour of the dashed prediction line
    """
    plt.figure(figsize=(16, 10))

    # Observed data first, so it is the first legend entry.
    plt.plot(x, y)
    plt.plot(future_forcast, pred, color=color, linestyle='dashed')
    plt.legend(['Confirmed Cases', algo_name], prop={'size': 20})

    plt.title('Worldwide Coronavirus Cases Over Time', size=30)
    plt.xlabel('Days Since 1/22/2020', size=30)
    plt.ylabel('# of Cases', size=30)
    plt.xticks(size=20)
    plt.yticks(size=20)
    plt.show()

Predictions for confirmed coronavirus cases worldwide

# One figure per model: observed worldwide cases against its predictions.
for predictions, label, line_color in (
    (svm_pred, 'SVM Predictions', 'purple'),
    (linear_pred, 'Polynomial Regression Predictions', 'orange'),
    (bayesian_pred, 'Bayesian Ridge Regression Predictions', 'green'),
):
    plot_predictions(adjusted_dates, world_cases, predictions, label, line_color)

Future predictions using SVM

 
# Table of the SVM predictions for the forecast horizon only. The slice uses
# days_in_future instead of a hard-coded 10 so it follows the horizon.
svm_df = pd.DataFrame({
    'Date': future_forcast_dates[-days_in_future:],
    'SVM Predicted # of Confirmed Cases Worldwide': np.round(svm_pred[-days_in_future:]),
})
svm_df.style.background_gradient(cmap='Reds')
Date SVM Predicted # of Confirmed Cases Worldwide
0 10/10/2020 41564298.000000
1 10/11/2020 42028255.000000
2 10/12/2020 42495753.000000
3 10/13/2020 42966806.000000
4 10/14/2020 43441428.000000
5 10/15/2020 43919632.000000
6 10/16/2020 44401431.000000
7 10/17/2020 44886840.000000
8 10/18/2020 45375870.000000
9 10/19/2020 45868537.000000

Future predictions using polynomial regression

# Flatten the polynomial regression predictions to 1-D before tabulating.
linear_pred = linear_pred.reshape(1,-1)[0]
# Table of the predictions for the forecast horizon only. The slice uses
# days_in_future instead of a hard-coded 10 so it follows the horizon.
linear_df = pd.DataFrame({
    'Date': future_forcast_dates[-days_in_future:],
    'Polynomial Predicted # of Confirmed Cases Worldwide': np.round(linear_pred[-days_in_future:]),
})
linear_df.style.background_gradient(cmap='Reds')
Date Polynomial Predicted # of Confirmed Cases Worldwide
0 10/10/2020 35561907.000000
1 10/11/2020 35742041.000000
2 10/12/2020 35916673.000000
3 10/13/2020 36085620.000000
4 10/14/2020 36248697.000000
5 10/15/2020 36405715.000000
6 10/16/2020 36556484.000000
7 10/17/2020 36700810.000000
8 10/18/2020 36838496.000000
9 10/19/2020 36969344.000000

Future predictions using Bayesian Ridge

# Table of the Bayesian ridge predictions for the forecast horizon only. The
# slice uses days_in_future instead of a hard-coded 10.
bayesian_df = pd.DataFrame({
    'Date': future_forcast_dates[-days_in_future:],
    'Bayesian Ridge Predicted # of Confirmed Cases Worldwide': np.round(bayesian_pred[-days_in_future:]),
})
bayesian_df.style.background_gradient(cmap='Reds')
Date Bayesian Ridge Predicted # of Confirmed Cases Worldwide
0 10/10/2020 37208289.000000
1 10/11/2020 37515075.000000
2 10/12/2020 37822044.000000
3 10/13/2020 38129164.000000
4 10/14/2020 38436406.000000
5 10/15/2020 38743739.000000
6 10/16/2020 39051133.000000
7 10/17/2020 39358556.000000
8 10/18/2020 39665976.000000
9 10/19/2020 39973362.000000

Mortality Rate (worldwide) susceptible to change

# Daily worldwide deaths/confirmed ratio, with its mean over all days drawn
# as a dashed horizontal reference line.
mean_mortality_rate = np.mean(mortality_rate)

plt.figure(figsize=(16, 10))
plt.plot(adjusted_dates, mortality_rate, color='orange')
plt.axhline(y=mean_mortality_rate, color='black', linestyle='--')
plt.legend(['mortality rate', 'y=' + str(mean_mortality_rate)], prop={'size': 20})

plt.title('Worldwide Mortality Rate of Coronavirus Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('Case Mortality Rate', size=30)
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()

Recovery Rate (worldwide) susceptible to change

# Daily worldwide recovered/confirmed ratio, with its mean over all days
# drawn as a dashed horizontal reference line.
mean_recovery_rate = np.mean(recovery_rate)

plt.figure(figsize=(16, 10))
plt.plot(adjusted_dates, recovery_rate, color='blue')
plt.axhline(y=mean_recovery_rate, color='black', linestyle='--')
plt.legend(['recovery rate', 'y=' + str(mean_recovery_rate)], prop={'size': 20})

plt.title('Worldwide Recovery Rate of Coronavirus Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('Case Recovery Rate', size=30)
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()

Graphing deaths against recoveries

# Cumulative deaths (red) and recoveries (green) on the same time axis.
plt.figure(figsize=(16, 10))
for cumulative, line_color in ((total_deaths, 'r'), (total_recovered, 'green')):
    plt.plot(adjusted_dates, cumulative, color=line_color)
plt.legend(['death', 'recoveries'], loc='best', fontsize=25)

plt.title('Worldwide Coronavirus Cases', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('# of Cases', size=30)
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()

Plotting the number of deaths against the number of recoveries

# Cumulative deaths plotted against cumulative recoveries (no time axis).
plt.figure(figsize=(16, 10))
plt.plot(total_recovered, total_deaths)

plt.xlabel('# of Coronavirus Recoveries', size=30)
plt.ylabel('# of Coronavirus Deaths', size=30)
plt.title('# of Coronavirus Deaths vs. # of Coronavirus Recoveries', size=30)
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()

Getting information about countries/regions that have confirmed coronavirus cases

# Sum the daily-report columns per country in a single pass. sort=False keeps
# countries in order of first appearance, so ties in the sort below keep the
# same relative order as the data. Rows whose Country_Region is missing are
# not part of any group.
country_totals = (
    latest_data
    .groupby('Country_Region', sort=False)[
        ['Confirmed', 'Deaths', 'Recovered', 'Active', 'Incidence_Rate']
    ]
    .sum()
)

# Countries without any confirmed case are listed separately and excluded.
has_cases = country_totals['Confirmed'] > 0
no_cases = country_totals.index[~has_cases].tolist()

# Sort the remaining countries by confirmed cases, largest first.
country_totals = country_totals[has_cases].sort_values(
    'Confirmed', ascending=False, kind='mergesort'
)

unique_countries = country_totals.index.tolist()
country_confirmed_cases = country_totals['Confirmed'].tolist()
country_death_cases = country_totals['Deaths'].tolist()
country_recovery_cases = country_totals['Recovered'].tolist()
country_active_cases = country_totals['Active'].tolist()
# NOTE(review): this is the sum of the per-region Incidence_Rate values, as
# before; for countries reported in several rows a sum of rates is probably
# not a meaningful country-level rate — confirm what is intended.
country_incidence_rate = country_totals['Incidence_Rate'].tolist()
# Deaths divided by confirmed cases for each country.
country_mortality_rate = (country_totals['Deaths'] / country_totals['Confirmed']).tolist()

Data table

# Per-country summary table; every column comes from the parallel lists
# built for the countries with confirmed cases.
country_df = pd.DataFrame({
    'Country Name': unique_countries,
    'Number of Confirmed Cases': country_confirmed_cases,
    'Number of Deaths': country_death_cases,
    'Number of Recoveries': country_recovery_cases,
    'Number of Active Cases': country_active_cases,
    'Incidence Rate': country_incidence_rate,
    'Mortality Rate': country_mortality_rate,
})

# Display the table with an orange gradient over the numeric columns.
country_df.style.background_gradient(cmap='Oranges')
Country Name Number of Confirmed Cases Number of Deaths Number of Recoveries Number of Active Cases Incidence Rate Mortality Rate
0 US 7549682 211801 2999895 4337988.000000 6591745.216429 0.028054
1 India 6835655 105526 5827704 902425.000000 23997.512161 0.015438
2 Brazil 5000694 148228 4457172 395294.000000 86700.387291 0.029641
3 Russia 1242258 21755 991277 229226.000000 73285.074851 0.017512
4 Colombia 877684 27180 773973 76531.000000 49362.211131 0.030968
5 Argentina 840915 22226 670725 147964.000000 1860.605251 0.026431
6 Spain 835901 32562 150376 652963.000000 32514.575797 0.038954
7 Peru 832929 32914 723606 76409.000000 62429.131575 0.039516
8 Mexico 799188 82726 679693 36769.000000 20739.385260 0.103513
9 France 693603 32463 102061 559021.000000 11206.745526 0.046803
10 South Africa 685155 17248 618127 49780.000000 1155.235430 0.025174
11 United Kingdom 546952 42605 2425 501922.000000 8642.449429 0.077895
12 Iran 483844 27658 397109 59077.000000 576.053089 0.057163
13 Chile 474440 13090 447053 14298.000000 36175.766845 0.027590
14 Iraq 391044 9604 319784 61656.000000 972.202053 0.024560
15 Bangladesh 373151 5440 286631 81080.000000 226.578662 0.014579
16 Saudi Arabia 337711 4947 323208 9556.000000 970.047366 0.014649
17 Italy 333940 36061 235303 62576.000000 11033.927138 0.107986
18 Philippines 329637 5925 273723 49989.000000 300.815602 0.017974
19 Turkey 329138 8609 288954 31575.000000 390.255681 0.026156
20 Pakistan 316934 6544 302375 8015.000000 1935.790852 0.020648
21 Indonesia 315714 11472 240291 63951.000000 115.424766 0.036337
22 Germany 311137 9582 269722 31833.000000 4970.416123 0.030797
23 Israel 281481 1824 216613 63044.000000 3252.032426 0.006480
24 Ukraine 245698 4707 109968 131023.000000 15849.050872 0.019158
25 Canada 175380 9593 147814 17974.000000 2601.745100 0.054698
26 Netherlands 155465 6574 4607 144284.000000 14471.340177 0.042286
27 Ecuador 143531 11743 120511 11277.000000 813.526678 0.081815
28 Romania 142570 5203 111564 25803.000000 741.097602 0.036494
29 Morocco 140024 2439 118142 19443.000000 379.360290 0.017418
30 Belgium 137868 10108 19895 107865.000000 1189.582122 0.073317
31 Bolivia 137706 8192 99268 30246.000000 1179.693805 0.059489
32 Qatar 127181 218 124108 2855.000000 4414.382207 0.001714
33 Panama 117300 2448 93610 21242.000000 2718.570268 0.020870
34 Dominican Republic 116148 2159 92157 21832.000000 1070.695316 0.018588
35 Kuwait 108743 639 100776 7328.000000 2546.338738 0.005876
36 Kazakhstan 108454 1746 103604 3104.000000 577.598617 0.016099
37 Poland 107319 2792 75346 29181.000000 283.563083 0.026016
38 Egypt 104035 6010 97492 533.000000 101.661804 0.057769
39 Oman 103465 1000 91329 11136.000000 2026.094745 0.009665
40 United Arab Emirates 101840 436 91710 9694.000000 1029.685351 0.004281
41 Sweden 96677 5892 0 90785.000000 18122.012006 0.060945
42 Guatemala 95704 3335 84036 8333.000000 534.194648 0.034847
43 Czechia 95360 829 50767 43764.000000 890.467460 0.008693
44 Nepal 94253 578 68668 25007.000000 323.484302 0.006132
45 China 90687 4739 85588 360.000000 232.462721 0.052257
46 Japan 87039 1614 79123 6303.000000 1850.902203 0.018543
47 Costa Rica 83497 1024 50295 32178.000000 1639.087779 0.012264
48 Belarus 81505 874 75683 4948.000000 862.548748 0.010723
49 Portugal 81256 2040 51037 28179.000000 796.884720 0.025106
50 Honduras 81016 2466 30590 47960.000000 817.962710 0.030438
51 Ethiopia 80895 1255 35670 43970.000000 70.365761 0.015514
52 Venezuela 80404 671 71531 8202.000000 282.754822 0.008345
53 Bahrain 73932 262 69411 4259.000000 4344.895312 0.003544
54 Nigeria 59738 1113 51403 7222.000000 28.979392 0.018631
55 Uzbekistan 59579 491 56165 2923.000000 178.011431 0.008241
56 Moldova 58794 1406 42480 14908.000000 1457.474945 0.023914
57 Singapore 57840 27 57624 189.000000 988.659981 0.000467
58 Switzerland 57709 2082 47300 8327.000000 666.800083 0.036078
59 Armenia 53755 995 45110 7650.000000 1814.065308 0.018510
60 Algeria 52520 1771 36857 13892.000000 119.769101 0.033720
61 Austria 50848 830 40499 9519.000000 564.576301 0.016323
62 Lebanon 48377 433 21120 26824.000000 708.774611 0.008951
63 Kyrgyzstan 48097 1069 43798 3230.000000 737.210177 0.022226
64 Ghana 46829 303 46060 466.000000 150.706668 0.006470
65 Paraguay 46435 989 29270 16176.000000 651.031261 0.021299
66 West Bank and Gaza 42840 355 35953 6532.000000 839.766841 0.008287
67 Azerbaijan 41113 602 38858 1653.000000 405.486640 0.014643
68 Kenya 39907 748 31659 7500.000000 74.216171 0.018744
69 Ireland 39584 1816 23364 14404.000000 801.653207 0.045877
70 Afghanistan 39548 1469 33045 5034.000000 101.591794 0.037145
71 Libya 39513 608 22831 16074.000000 575.045112 0.015387
72 Serbia 34193 758 0 33435.000000 391.342017 0.022168
73 Hungary 33114 877 9149 23088.000000 342.782611 0.026484
74 Denmark 31201 663 24706 5832.000000 1530.729332 0.021249
75 El Salvador 29737 873 24643 4221.000000 458.465595 0.029357
76 Bosnia and Herzegovina 29075 908 22614 5553.000000 886.212725 0.031230
77 Australia 27206 897 24939 1370.000000 516.437774 0.032971
78 Tunisia 24542 364 5032 19146.000000 207.655413 0.014832
79 Korea, South 24422 427 22463 1532.000000 47.634853 0.017484
80 Bulgaria 22743 873 15448 6422.000000 327.310643 0.038385
81 Burma 21433 510 6084 14839.000000 39.391805 0.023795
82 Greece 20947 424 1347 19176.000000 200.967931 0.020242
83 Cameroon 20924 420 19764 740.000000 78.822072 0.020073
84 Jordan 20200 131 5575 14494.000000 197.978269 0.006485
85 Cote d'Ivoire 19935 120 19550 265.000000 75.573554 0.006020
86 North Macedonia 19413 772 15749 2892.000000 931.803128 0.039767
87 Croatia 18447 309 16308 1830.000000 449.349470 0.016751
88 Madagascar 16633 235 15808 590.000000 60.066406 0.014129
89 Kosovo 15938 638 14143 1157.000000 880.374466 0.040030
90 Zambia 15224 335 14342 547.000000 82.811338 0.022005
91 Senegal 15174 313 12998 1863.000000 90.623886 0.020627
92 Norway 15013 275 11863 2875.000000 276.929161 0.018317
93 Albania 14730 407 9115 5208.000000 511.849329 0.027631
94 Slovakia 14689 55 5200 9434.000000 269.046896 0.003744
95 Malaysia 13993 141 10501 3351.000000 43.233643 0.010076
96 Sudan 13668 836 6764 6068.000000 31.170417 0.061165
97 Montenegro 12794 190 8907 3697.000000 2037.060035 0.014851
98 Namibia 11714 126 9673 1915.000000 461.014847 0.010756
99 Finland 11049 346 8500 2203.000000 199.414589 0.031315
100 Guinea 10863 68 10176 619.000000 82.716607 0.006260
101 Congo (Kinshasa) 10804 276 10239 289.000000 12.063232 0.025546
102 Maldives 10656 34 9547 1075.000000 1971.354677 0.003191
103 Tajikistan 10055 78 8876 1101.000000 105.424381 0.007757
104 Georgia 9753 63 5235 4455.000000 244.486642 0.006460
105 Mozambique 9494 68 6812 2614.000000 30.375517 0.007162
106 Uganda 9260 85 5588 3587.000000 20.244420 0.009179
107 Luxembourg 9119 128 7900 1091.000000 1456.765116 0.014037
108 Haiti 8838 229 7013 1596.000000 77.509094 0.025911
109 Gabon 8815 54 8164 597.000000 396.050191 0.006126
110 Zimbabwe 7919 229 6441 1249.000000 53.280219 0.028918
111 Mauritania 7535 162 7212 161.000000 162.054860 0.021500
112 Jamaica 7191 126 2700 4365.000000 242.843939 0.017522
113 Slovenia 7120 159 4535 2426.000000 342.483544 0.022331
114 Cabo Verde 6624 71 5684 869.000000 1191.392620 0.010719
115 Cuba 5898 123 5321 454.000000 52.072040 0.020855
116 Malawi 5803 180 4575 1048.000000 30.334624 0.031018
117 Angola 5725 211 2598 2916.000000 17.419075 0.036856
118 Eswatini 5617 113 5196 308.000000 484.155688 0.020118
119 Lithuania 5483 101 2600 2782.000000 201.411238 0.018421
120 Djibouti 5423 61 5353 9.000000 548.885529 0.011248
121 Nicaragua 5264 153 4225 886.000000 79.461953 0.029065
122 Congo (Brazzaville) 5089 89 3887 1113.000000 92.223906 0.017489
123 Equatorial Guinea 5052 83 4894 75.000000 360.089381 0.016429
124 Suriname 4979 106 4781 92.000000 848.740441 0.021289
125 Trinidad and Tobago 4887 84 3010 1793.000000 349.198387 0.017188
126 Rwanda 4883 29 3408 1446.000000 37.700133 0.005939
127 Central African Republic 4852 62 1914 2876.000000 100.460395 0.012778
128 Bahamas 4713 102 2607 2004.000000 1198.480348 0.021642
129 Syria 4504 212 1198 3094.000000 25.736177 0.047069
130 Sri Lanka 4459 13 3274 1172.000000 20.823556 0.002915
131 Somalia 3745 99 3010 636.000000 23.563508 0.026435
132 Estonia 3715 67 2813 835.000000 280.052075 0.018035
133 Thailand 3622 59 3439 124.000000 5.189113 0.016289
134 Gambia 3613 117 2235 1261.000000 149.503613 0.032383
135 Malta 3442 41 2865 536.000000 779.546088 0.011912
136 Guyana 3292 95 2084 1113.000000 418.531858 0.028858
137 Mali 3210 131 2502 577.000000 15.851199 0.040810
138 Botswana 3172 18 834 2320.000000 134.885452 0.005675
139 Iceland 3172 10 2366 796.000000 929.523810 0.003153
140 South Sudan 2748 50 1290 1408.000000 24.549460 0.018195
141 Andorra 2568 53 1715 800.000000 3323.626480 0.020639
142 Benin 2411 41 1973 397.000000 19.887492 0.017005
143 Guinea-Bissau 2385 40 1728 617.000000 121.189148 0.016771
144 Belize 2310 34 1427 849.000000 580.955231 0.014719
145 Sierra Leone 2287 72 1716 499.000000 28.669980 0.031482
146 Latvia 2261 40 1322 899.000000 119.870512 0.017691
147 Burkina Faso 2222 59 1478 685.000000 10.629912 0.026553
148 Uruguay 2206 49 1890 267.000000 63.505278 0.022212
149 Yemen 2049 593 1328 128.000000 6.869852 0.289409
150 Togo 1898 49 1419 430.000000 22.926202 0.025817
151 Cyprus 1897 24 1369 504.000000 157.119536 0.012652
152 New Zealand 1864 25 1800 39.000000 38.654291 0.013412
153 Lesotho 1767 40 926 801.000000 82.483293 0.022637
154 Liberia 1355 82 1245 28.000000 26.790956 0.060517
155 Chad 1251 89 1090 72.000000 7.616040 0.071143
156 Niger 1200 69 1122 9.000000 4.957318 0.057500
157 Vietnam 1099 35 1023 41.000000 1.129049 0.031847
158 Sao Tome and Principe 914 15 888 11.000000 417.045003 0.016411
159 San Marino 732 42 680 10.000000 2156.874300 0.057377
160 Diamond Princess 712 13 651 48.000000 0.000000 0.018258
161 Papua New Guinea 541 7 527 7.000000 6.046701 0.012939
162 Taiwan* 523 7 486 30.000000 2.195931 0.013384
163 Burundi 515 1 472 42.000000 4.331086 0.001942
164 Tanzania 509 21 183 305.000000 0.852108 0.041257
165 Comoros 491 7 468 16.000000 56.463066 0.014257
166 Eritrea 398 0 358 40.000000 11.222563 0.000000
167 Mauritius 395 10 358 27.000000 31.059148 0.025316
168 Mongolia 315 0 308 7.000000 9.608662 0.000000
169 Bhutan 304 0 252 52.000000 39.398039 0.000000
170 Cambodia 281 0 276 5.000000 1.680725 0.000000
171 Monaco 227 2 202 23.000000 578.432372 0.008811
172 Barbados 203 7 182 14.000000 70.640392 0.034483
173 Seychelles 148 0 143 5.000000 150.498271 0.000000
174 Brunei 146 3 143 0.000000 33.372725 0.020548
175 Liechtenstein 131 1 116 14.000000 343.498440 0.007634
176 Antigua and Barbuda 108 3 97 8.000000 110.285107 0.027778
177 Saint Vincent and the Grenadines 64 0 64 0.000000 57.685201 0.000000
178 Fiji 32 2 28 2.000000 3.569660 0.062500
179 Dominica 31 0 24 7.000000 43.060938 0.000000
180 Timor-Leste 28 0 28 0.000000 2.123719 0.000000
181 Saint Lucia 27 0 27 0.000000 14.703560 0.000000
182 Grenada 24 0 24 0.000000 21.329731 0.000000
183 Laos 23 0 22 1.000000 0.316127 0.000000
184 Saint Kitts and Nevis 19 0 17 2.000000 35.719657 0.000000
185 Holy See 12 0 12 0.000000 1483.312732 0.000000
186 Western Sahara 10 1 8 1.000000 1.674116 0.100000
187 MS Zaandam 9 2 0 7.000000 0.000000 0.222222
# Every distinct Province_State value that appears in the latest daily report.
unique_provinces = latest_data['Province_State'].unique().tolist()

Getting the latest information about provinces/states that have confirmed coronavirus cases

# Aggregate the latest report per (country, province/state) pair.
# Province names are not guaranteed to be unique across countries; matching
# rows on the province name alone would add same-named regions together and
# attribute the total to whichever country happens to be listed first.
# Rows without a Province_State are left out by the grouping.
province_metrics = ['Confirmed', 'Deaths', 'Active', 'Incidence_Rate']
province_totals = (
    latest_data
    .groupby(['Country_Region', 'Province_State'], sort=False)[province_metrics]
    .sum()
)

# Provinces/states without confirmed cases are kept out of the ranking; their
# names are still recorded in no_cases.
has_cases = province_totals['Confirmed'] > 0
no_cases = province_totals[~has_cases].index.get_level_values('Province_State').tolist()

# Rank from most to fewest confirmed cases (mergesort keeps ties in their
# original order).
province_totals = province_totals[has_cases].sort_values(
    'Confirmed', ascending=False, kind='mergesort'
)

# Parallel lists, all in the ranked order.  unique_provinces may now contain
# the same name more than once; province_country tells the entries apart.
unique_provinces = province_totals.index.get_level_values('Province_State').tolist()
province_country = province_totals.index.get_level_values('Country_Region').tolist()
province_confirmed_cases = province_totals['Confirmed'].tolist()
province_death_cases = province_totals['Deaths'].tolist()
# Recoveries are not aggregated at province level.
province_active = province_totals['Active'].tolist()
# NOTE(review): for provinces reported as several rows this is the sum of the
# rows' Incidence_Rate values; confirm that a sum of rates is intended.
province_incidence_rate = province_totals['Incidence_Rate'].tolist()
province_mortality_rate = (
    province_totals['Deaths'] / province_totals['Confirmed']
).tolist()
# # handle nan if there is any, it is usually a float: float('nan')

# for i in range(len(unique_provinces)):
#     if type(unique_provinces[i]) == float:
#         nan_indices.append(i)

# unique_provinces = list(unique_provinces)
# province_confirmed_cases = list(province_confirmed_cases)

# for i in nan_indices:
#     unique_provinces.pop(i)
#     province_confirmed_cases.pop(i)
# Only the first province_limit entries of each list go into the table.
province_limit = 100
leading_rows = slice(0, province_limit)

province_table_columns = {
    'Province/State Name': unique_provinces[leading_rows],
    'Country': province_country[leading_rows],
    'Number of Confirmed Cases': province_confirmed_cases[leading_rows],
    'Number of Deaths': province_death_cases[leading_rows],
    'Number of Active Cases' : province_active[leading_rows],
    'Incidence Rate' : province_incidence_rate[leading_rows],
    'Mortality Rate': province_mortality_rate[leading_rows],
}
province_df = pd.DataFrame(province_table_columns)

# Display the per-province/state table with an 'Oranges' background gradient.
province_df.style.background_gradient(cmap='Oranges')
Province/State Name Country Number of Confirmed Cases Number of Deaths Number of Active Cases Incidence Rate Mortality Rate
0 Maharashtra India 1480489 39072 244976.000000 1202.239913 0.026391
1 Sao Paulo Brazil 1016755 36669 85082.000000 2214.233574 0.036065
2 California US 841928 16338 825590.000000 97042.918220 0.019405
3 Texas US 803690 16661 787029.000000 585661.650126 0.020731
4 Andhra Pradesh India 734427 6086 49513.000000 1362.487515 0.008287
5 Florida US 722707 14904 707803.000000 252486.050695 0.020622
6 Karnataka India 668652 9574 116172.000000 989.676461 0.014318
7 Tamil Nadu India 635855 9984 45135.000000 816.861062 0.015702
8 New York US 468268 33226 435042.000000 68518.583673 0.070955
9 England United Kingdom 465704 37753 427951.000000 831.953009 0.081067
10 Uttar Pradesh India 424326 6200 43154.000000 178.376131 0.014611
11 Lima Peru 379347 14804 364543.000000 3569.148986 0.039025
12 Georgia US 326142 7259 318883.000000 520180.869873 0.022257
13 Bahia Brazil 319981 7021 6595.000000 2151.412782 0.021942
14 Moscow Russia 314788 5442 52984.000000 2517.001603 0.017288
15 Minas Gerais Brazil 313032 7811 26949.000000 1478.742929 0.024953
16 Illinois US 310335 9127 301208.000000 183939.380296 0.029410
17 Delhi India 298107 5616 22186.000000 1593.224535 0.018839
18 Metropolitana Chile 289158 9404 3263.000000 4065.314289 0.032522
19 Capital District Colombia 281534 6982 24365.000000 3798.063990 0.024800
20 West Bengal India 280504 5376 28361.000000 281.604219 0.019166
21 Rio de Janeiro Brazil 277439 18969 5245.000000 1606.949991 0.068372
22 Ceara Brazil 256764 9094 26437.000000 2811.671122 0.035418
23 Madrid Spain 255615 9634 205245.000000 3848.667703 0.037689
24 Kerala India 253405 906 92246.000000 709.829002 0.003575
25 Odisha India 240998 958 26368.000000 519.881490 0.003975
26 Para Brazil 235948 6623 8603.000000 2742.667704 0.028070
27 Goias Brazil 224181 5020 5460.000000 3194.210494 0.022393
28 North Carolina US 222969 3693 219276.000000 214244.682876 0.016563
29 Santa Catarina Brazil 222652 2880 8517.000000 3107.586714 0.012935
30 Arizona US 222538 5733 216806.000000 46542.262687 0.025762
31 Punjab India 221132 5959 13082.000000 492.123143 0.026948
32 New Jersey US 209850 16152 193698.000000 44057.873952 0.076969
33 Rio Grande do Sul Brazil 207706 5035 9134.000000 1825.627466 0.024241
34 Tennessee US 207455 2642 204813.000000 307984.113874 0.012735
35 Telangana India 206644 1201 26368.000000 524.973724 0.005812
36 Distrito Federal Brazil 197369 3378 7613.000000 6545.653653 0.017115
37 Bihar India 191985 927 11326.000000 153.834226 0.004829
38 Assam India 190209 785 31786.000000 534.189321 0.004127
39 Parana Brazil 187729 4703 46125.000000 1641.855046 0.025052
40 Maranhao Brazil 176995 3828 6041.000000 2501.632114 0.021628
41 Pennsylvania US 171527 8247 163280.000000 58419.326863 0.048080
42 Louisiana US 170097 5604 164493.000000 254401.938988 0.032946
43 Ohio US 162723 4970 157753.000000 101558.871904 0.030543
44 Amazonas Brazil 162616 4534 35454.000000 10676.880605 0.027882
45 Alabama US 161418 2601 158817.000000 240283.508340 0.016113
46 South Carolina US 153705 3502 150203.000000 145465.750509 0.022784
47 Virginia US 153451 3300 150151.000000 234650.892269 0.021505
48 Pernambuco Brazil 151139 8379 11625.000000 1581.436405 0.055439
49 Rajasthan India 150467 1590 21351.000000 185.686791 0.010567
50 Catalonia Spain 150438 5861 118374.000000 1988.229325 0.038960
51 Gujarat India 146673 3531 16485.000000 229.634400 0.024074
52 Michigan US 145092 7169 137923.000000 73053.547245 0.049410
53 Madhya Pradesh India 140307 2518 17522.000000 164.372893 0.017946
54 Sindh Pakistan 139195 2535 4477.000000 290.679639 0.018212
55 Wisconsin US 138698 1415 137283.000000 142083.779635 0.010202
56 Missouri US 137420 2238 135183.000000 217413.591916 0.016286
57 Haryana India 137398 1528 11029.000000 487.145898 0.011121
58 Espirito Santo Brazil 136590 3617 7170.000000 3398.902617 0.026481
59 Massachusetts US 136492 9557 126935.000000 19991.555711 0.070019
60 Ciudad de Mexico Mexico 136154 13779 7226.000000 1509.694638 0.101202
61 Chhattisgarh India 131739 1134 26777.000000 447.540312 0.008608
62 Mato Grosso Brazil 129038 3525 15023.000000 3703.236020 0.027318
63 Maryland US 128664 3973 124691.000000 39357.659455 0.030879
64 Indiana US 128227 3727 124500.000000 145061.251419 0.029066
65 Paraiba Brazil 124315 2884 22232.000000 3093.854425 0.023199
66 Antioquia Colombia 122871 2568 10894.000000 1917.731293 0.020900
67 Lombardia Italy 109186 16978 9877.000000 1085.285989 0.155496
68 Minnesota US 106651 2154 104497.000000 142671.198402 0.020197
69 Mississippi US 102241 3051 99190.000000 323480.703516 0.029841
70 Piaui Brazil 99960 2180 -74.000000 3053.867025 0.021809
71 Iowa US 95093 1419 93674.000000 263925.786771 0.014922
72 Oklahoma US 94352 1075 93277.000000 164942.077198 0.011394
73 Washington US 91208 2177 89031.000000 49287.775914 0.023869
74 Jharkhand India 89702 767 9759.000000 232.425042 0.008551
75 Arkansas US 88880 1482 87398.000000 226105.098240 0.016674
76 Alagoas Brazil 88426 2115 1053.000000 2649.581690 0.023918
77 Mexico Mexico 87230 9965 3359.000000 500.522441 0.114238
78 Nevada US 83347 1636 81711.000000 16174.700927 0.019629
79 Quebec Canada 81914 5906 8273.000000 959.441647 0.072100
80 Jammu and Kashmir India 81097 1282 12131.000000 596.024494 0.015808
81 Utah US 80446 496 79950.000000 26861.823824 0.006166
82 Sergipe Brazil 78692 2072 4459.000000 3423.332185 0.026331
83 Kentucky US 76587 1223 75364.000000 167179.116739 0.015969
84 Moscow Oblast Russia 75712 1384 16752.000000 1009.037921 0.018280
85 Nordrhein-Westfalen Germany 75671 1894 8626.000000 421.973304 0.025029
86 Colorado US 74899 2085 72814.000000 63314.256120 0.027837
87 Mato Grosso do Sul Brazil 73027 1385 5027.000000 2627.828999 0.018966
88 Rio Grande do Norte Brazil 71898 2412 28037.000000 2050.214252 0.033548
89 Bayern Germany 71034 2681 5078.000000 543.209571 0.037742
90 Andalusia Spain 70560 1941 57948.000000 837.268412 0.027509
91 Tocantins Brazil 69969 985 14187.000000 4448.503560 0.014078
92 Atlantico Colombia 68465 3080 1370.000000 2700.238255 0.044986
93 Hubei China 68139 4512 0.000000 115.158019 0.066218
94 Rondonia Brazil 67181 1385 7013.000000 3780.106627 0.020616
95 Valle del Cauca Colombia 66796 2395 6447.000000 1492.352576 0.035855
96 Kansas US 62941 713 62228.000000 197169.529217 0.011328
97 Connecticut US 59364 4522 54842.000000 10522.703133 0.076174
98 Ontario Canada 58202 3039 5431.000000 395.613679 0.052215
99 Uttarakhand India 52959 688 8367.000000 470.710767 0.012991
# Work on the US rows only.  Summing over every row that merely shares the
# state's name also counts rows reported under other countries
# (NOTE(review): names such as the cruise-ship entries may be listed under
# more than one country), which skews the US figures.
us_rows = latest_data[latest_data['Country_Region'] == 'US']
state_metrics = ['Confirmed', 'Deaths', 'Active', 'Incidence_Rate']
state_totals = us_rows.groupby('Province_State', sort=False)[state_metrics].sum()

# States/territories without confirmed cases are excluded from the table;
# their names are still recorded in no_cases.
has_cases = state_totals['Confirmed'] > 0
no_cases = state_totals[~has_cases].index.tolist()

# Rank from most to fewest confirmed cases (mergesort keeps ties in their
# original order).
state_totals = state_totals[has_cases].sort_values(
    'Confirmed', ascending=False, kind='mergesort'
)

# Parallel lists, all in the ranked order.
us_states = state_totals.index.tolist()
state_confirmed_cases = state_totals['Confirmed'].tolist()
state_death_cases = state_totals['Deaths'].tolist()
# Recoveries are not aggregated at state level.
state_active = state_totals['Active'].tolist()
# NOTE(review): states are reported as several rows, so this is the sum of the
# rows' Incidence_Rate values; confirm that a sum of rates is intended.
state_incidence_rate = state_totals['Incidence_Rate'].tolist()
state_mortality_rate = (state_totals['Deaths'] / state_totals['Confirmed']).tolist()

state_df = pd.DataFrame({'State Name': us_states, 'Number of Confirmed Cases': state_confirmed_cases,
                          'Number of Deaths': state_death_cases, 'Number of Active Cases' : state_active, 
                         'Incidence Rate' : state_incidence_rate, 'Mortality Rate': state_mortality_rate})

# Display the per-state table with an 'Oranges' background gradient.
state_df.style.background_gradient(cmap='Oranges')
State Name Number of Confirmed Cases Number of Deaths Number of Active Cases Incidence Rate Mortality Rate
0 California 841928 16338 825590.000000 97042.918220 0.019405
1 Texas 803690 16661 787029.000000 585661.650126 0.020731
2 Florida 722707 14904 707803.000000 252486.050695 0.020622
3 New York 468268 33226 435042.000000 68518.583673 0.070955
4 Georgia 326142 7259 318883.000000 520180.869873 0.022257
5 Illinois 310335 9127 301208.000000 183939.380296 0.029410
6 North Carolina 222969 3693 219276.000000 214244.682876 0.016563
7 Arizona 222538 5733 216806.000000 46542.262687 0.025762
8 New Jersey 209850 16152 193698.000000 44057.873952 0.076969
9 Tennessee 207455 2642 204813.000000 307984.113874 0.012735
10 Pennsylvania 171527 8247 163280.000000 58419.326863 0.048080
11 Louisiana 170097 5604 164493.000000 254401.938988 0.032946
12 Ohio 162723 4970 157753.000000 101558.871904 0.030543
13 Alabama 161418 2601 158817.000000 240283.508340 0.016113
14 South Carolina 153705 3502 150203.000000 145465.750509 0.022784
15 Virginia 153451 3300 150151.000000 234650.892269 0.021505
16 Michigan 145092 7169 137923.000000 73053.547245 0.049410
17 Wisconsin 138698 1415 137283.000000 142083.779635 0.010202
18 Missouri 137420 2238 135183.000000 217413.591916 0.016286
19 Massachusetts 136492 9557 126935.000000 19991.555711 0.070019
20 Maryland 128664 3973 124691.000000 39357.659455 0.030879
21 Indiana 128227 3727 124500.000000 145061.251419 0.029066
22 Minnesota 106651 2154 104497.000000 142671.198402 0.020197
23 Mississippi 102241 3051 99190.000000 323480.703516 0.029841
24 Iowa 95093 1419 93674.000000 263925.786771 0.014922
25 Oklahoma 94352 1075 93277.000000 164942.077198 0.011394
26 Washington 91208 2177 89031.000000 49287.775914 0.023869
27 Arkansas 88880 1482 87398.000000 226105.098240 0.016674
28 Nevada 83347 1636 81711.000000 16174.700927 0.019629
29 Utah 80446 496 79950.000000 26861.823824 0.006166
30 Kentucky 76587 1223 75364.000000 167179.116739 0.015969
31 Colorado 74899 2085 72814.000000 63314.256120 0.027837
32 Kansas 62941 713 62228.000000 197169.529217 0.011328
33 Connecticut 59364 4522 54842.000000 10522.703133 0.076174
34 Puerto Rico 51768 705 51063.000000 94248.412403 0.013618
35 Nebraska 49396 507 48889.000000 157705.186149 0.010264
36 Idaho 45753 500 45253.000000 94813.744044 0.010928
37 Oregon 35634 583 35051.000000 35097.561461 0.016361
38 New Mexico 31372 896 30476.000000 39426.418792 0.028560
39 South Dakota 25906 258 25648.000000 174517.634859 0.009959
40 Rhode Island 25776 1126 24650.000000 6613.000880 0.043684
41 North Dakota 24857 304 24553.000000 144997.608182 0.012230
42 Delaware 21550 649 20901.000000 6781.963903 0.030116
43 West Virginia 17150 375 16775.000000 41123.215805 0.021866
44 Montana 16063 193 15870.000000 84543.077213 0.012015
45 District of Columbia 15652 631 15021.000000 2217.785643 0.040314
46 Hawaii 13045 163 12882.000000 1941.692261 0.012495
47 Alaska 8878 59 8819.000000 23052.109194 0.006646
48 New Hampshire 8731 446 8285.000000 3794.887990 0.051082
49 Wyoming 6899 53 6846.000000 26492.209596 0.007682
50 Maine 5603 142 5461.000000 4121.804077 0.025344
51 Guam 2868 57 2811.000000 1746.341998 0.019874
52 Vermont 1827 58 1769.000000 3107.296719 0.031746
53 Virgin Islands 1322 20 1302.000000 1232.427192 0.015129
54 Grand Princess 116 3 100.000000 0.000000 0.025862
55 Northern Mariana Islands 75 2 73.000000 136.007544 0.026667
56 Diamond Princess 49 1 49.000000 0.000000 0.020408

Bar Chart Visualizations for COVID-19

# Split the overall confirmed total into the US and everywhere else.
is_us_row = latest_data['Country_Region'] == 'US'
us_confirmed = latest_data.loc[is_us_row, 'Confirmed'].sum()
outside_us_confirmed = np.sum(country_confirmed_cases) - us_confirmed

# One horizontal bar per group.
plt.figure(figsize=(16, 9))
for bar_label, bar_value in (
    ('United States', us_confirmed),
    ('Outside United States', outside_us_confirmed),
):
    plt.barh(bar_label, bar_value)
plt.title('# of Coronavirus Confirmed Cases', size=20)
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()

# Print the same figures as text.
print('Outside United States {} cases:'.format(outside_us_confirmed))
print('United States {} cases'.format(us_confirmed))
print('Total: {} cases'.format(us_confirmed+outside_us_confirmed))
Outside United States 28606544 cases:
United States 7549682 cases
Total: 36156226 cases

Only show the 10 countries with the most confirmed cases; the rest are grouped into the "Others" category

# Keep the first 10 entries of the ranked country lists and fold everything
# after them into a single 'Others' bucket.
others = np.sum(country_confirmed_cases[10:])
visual_unique_countries = list(unique_countries[:10]) + ['Others']
visual_confirmed_cases = list(country_confirmed_cases[:10]) + [others]

Visual Representations (bar charts and pie charts)

def plot_bar_graphs(x, y, title, *, figsize=(16, 12), fontsize=20):
    """Draw a horizontal bar chart in a new figure and show it.

    Args:
        x: Bar labels, one per bar.
        y: Bar lengths, in the same order as ``x``.
        title: Chart title.
        figsize: ``(width, height)`` of the figure, passed to ``plt.figure``.
        fontsize: Text size used for the title and for both tick labels.
    """
    plt.figure(figsize=figsize)
    plt.barh(x, y)
    plt.title(title, size=fontsize)
    plt.xticks(size=fontsize)
    plt.yticks(size=fontsize)
    plt.show()


def plot_bar_graphs_tall(x, y, title):
    """Draw the same chart as ``plot_bar_graphs`` on a taller figure.

    Uses a 19x18 figure and size-25 text, for charts with many bars.
    """
    plot_bar_graphs(x, y, title, figsize=(19, 18), fontsize=25)
# Plot the country chart twice: raw counts first, then their base-10 logarithm.
plot_bar_graphs(
    visual_unique_countries,
    visual_confirmed_cases,
    '# of Covid-19 Confirmed Cases in Countries/Regions',
)
log_country_confirmed_cases = list(map(math.log10, visual_confirmed_cases))
plot_bar_graphs(
    visual_unique_countries,
    log_country_confirmed_cases,
    'Common Log # of Coronavirus Confirmed Cases in Countries/Regions',
)

Only show 10 provinces with the most confirmed cases, the rest are grouped into the other category

# Keep the first 10 entries of the ranked province lists and fold everything
# after them into a single 'Others' bucket.
others = np.sum(province_confirmed_cases[10:])
visual_unique_provinces = list(unique_provinces[:10]) + ['Others']
visual_confirmed_cases2 = list(province_confirmed_cases[:10]) + [others]

# Plot the province chart twice: raw counts first, then their base-10 logarithm.
plot_bar_graphs(
    visual_unique_provinces,
    visual_confirmed_cases2,
    '# of Coronavirus Confirmed Cases in Provinces/States',
)
log_province_confirmed_cases = list(map(math.log10, visual_confirmed_cases2))
plot_bar_graphs(
    visual_unique_provinces,
    log_province_confirmed_cases,
    'Log of # of Coronavirus Confirmed Cases in Provinces/States',
)

Pie Chart Visualizations for COVID-19

def plot_pie_charts(x, y, title):
    """Show a pie chart of the values in ``y`` with a legend built from ``x``.

    Each wedge is labelled with its own value from ``y``; the names in ``x``
    are only used for the legend entries.
    """
    # Muted colours, applied to the wedges in order.
    muted_palette = [
        'lightcoral', 'rosybrown', 'sandybrown', 'navajowhite', 'gold',
        'khaki', 'lightskyblue', 'turquoise', 'lightslategrey', 'thistle',
        'pink',
    ]
    plt.figure(figsize=(20, 15))
    plt.title(title, size=20)
    plt.pie(y, labels=y, colors=muted_palette, shadow=True)
    plt.legend(x, fontsize=12, loc='best')
    plt.show()
# One pie chart for the country breakdown and one for the province breakdown.
for pie_names, pie_values, pie_title in (
    (visual_unique_countries, visual_confirmed_cases, 'Covid-19 Confirmed Cases per Country'),
    (visual_unique_provinces, visual_confirmed_cases2, 'Covid-19 Confirmed Cases per State/Province/Region'),
):
    plot_pie_charts(pie_names, pie_values, pie_title)

Plotting countries with regional data using a pie chart

def plot_pie_country_with_regions(country_name, title):
    """Plot a pie chart of confirmed cases for the regions of one country.

    Regions are the ``Province_State`` values of the ``latest_data`` rows whose
    ``Country_Region`` equals ``country_name``.  Regions without confirmed
    cases, and rows without a province/state, are left out.  When more than
    five regions remain, the five largest are shown and all the others are
    combined into a single 'Others' slice.

    Args:
        country_name: Value to match against the ``Country_Region`` column.
        title: Title of the chart.
    """
    top_regions = 5

    # Select the country's rows before aggregating: matching on the region
    # name alone would also count same-named regions of other countries.
    country_rows = latest_data[latest_data['Country_Region'] == country_name]
    region_totals = country_rows.groupby('Province_State', sort=False)['Confirmed'].sum()

    # Largest regions first (mergesort keeps ties in their original order).
    region_totals = region_totals[region_totals > 0].sort_values(
        ascending=False, kind='mergesort'
    )
    regions = region_totals.index.tolist()
    confirmed_cases = region_totals.tolist()

    # Anything beyond the largest regions is folded into one 'Others' slice.
    if len(regions) > top_regions:
        remainder = sum(confirmed_cases[top_regions:])
        regions = regions[:top_regions] + ['Others']
        confirmed_cases = confirmed_cases[:top_regions] + [remainder]

    plot_pie_charts(regions, confirmed_cases, title)
# Countries to break down by province/state, one pie chart each.
pie_chart_countries = [
    'US', 'Brazil', 'Russia', 'India', 'Peru', 'Mexico', 'Canada',
    'Australia', 'China', 'Italy', 'Germany', 'France', 'United Kingdom',
    'Chile',
]

for country in pie_chart_countries:
    chart_title = 'Covid-19 Confirmed Cases in {}'.format(country)
    plot_pie_country_with_regions(country, chart_title)

US Medical Data on Testing

# Replace every missing value in the US medical table with 0, modifying the
# DataFrame in place.
us_medical_data.fillna(value=0, inplace=True)

def plot_us_medical_data():
    """Plot the states with the most testing, by volume and by rate.

    Reads the module-level ``us_medical_data`` table and draws two tall
    horizontal bar charts with ``plot_bar_graphs_tall``: the total
    ``People_Tested`` per state and the ``Testing_Rate`` per state.  Each
    chart is limited to the ``top_limit`` highest states.
    """
    top_limit = 30

    # sort=False keeps the grouping from having to order the state labels.
    per_state = us_medical_data.groupby('Province_State', sort=False)

    # Rank each metric once, keeping names and values together, instead of
    # sorting the names and then querying the table again for the values.
    testing_number = (
        per_state['People_Tested'].sum()
        .sort_values(ascending=False, kind='mergesort')
        .head(top_limit)
    )
    # The same aggregate (max) is used for both ranking and plotting; a rate
    # is not additive, so it is not summed across rows.
    testing_rate = (
        per_state['Testing_Rate'].max()
        .sort_values(ascending=False, kind='mergesort')
        .head(top_limit)
    )

    plot_bar_graphs_tall(
        testing_number.index.tolist(),
        testing_number.tolist(),
        'Total Testing per State (Top {})'.format(top_limit),
    )
    plot_bar_graphs_tall(
        testing_rate.index.tolist(),
        testing_rate.tolist(),
        'Testing Rate per 100,000 People (Top {})'.format(top_limit),
    )
 

# Draw the two US testing charts (testing volume and testing rate per state).
plot_us_medical_data()

Taking a look at Apple's mobility data. It can help us understand hotspot states in the US (states and territories).

def get_mobility_by_state(transport_type, state, day):
    """Return the summed mobility value for one state, transport type and day.

    Args:
        transport_type: Value matched against the ``transportation_type``
            column (e.g. ``'walking'``).
        state: Value matched against the ``sub-region`` column.
        day: Name of the date column to read (e.g. ``'2020-07-30'``).

    Returns:
        The sum of the ``day`` column over all matching rows of the
        module-level ``apple_mobility`` frame. Missing values are skipped by
        the sum, so the result is 0 when nothing matches or all values are
        missing.

    Raises:
        KeyError: If ``day`` is not a column of ``apple_mobility``.
    """
    # Combine both conditions into a single mask. Chaining df[m1][m2] applies
    # a full-length mask to an already filtered frame, which pandas warns about.
    matches = (
        (apple_mobility['sub-region'] == state)
        & (apple_mobility['transportation_type'] == transport_type)
    )
    # Sum only the requested column instead of every column of the frame.
    return apple_mobility.loc[matches, day].sum()
# Preview the first rows of the mobility table.
apple_mobility.head()
geo_type region transportation_type alternative_name sub-region country 2020-01-13 2020-01-14 2020-01-15 2020-01-16 ... 2020-09-28 2020-09-29 2020-09-30 2020-10-01 2020-10-02 2020-10-03 2020-10-04 2020-10-05 2020-10-06 2020-10-07
0 country/region Albania driving NaN NaN NaN 100.0 95.30 101.43 97.20 ... 117.30 116.38 119.24 118.79 130.25 148.03 136.67 123.11 117.50 119.25
1 country/region Albania walking NaN NaN NaN 100.0 100.68 98.93 98.46 ... 144.95 160.09 159.83 154.84 159.32 166.40 130.23 168.27 140.04 154.46
2 country/region Argentina driving NaN NaN NaN 100.0 97.07 102.45 111.21 ... 54.62 59.16 60.03 61.63 74.42 71.69 38.69 55.99 59.95 62.76
3 country/region Argentina walking NaN NaN NaN 100.0 95.11 101.37 112.67 ... 48.11 51.81 50.20 50.76 57.44 51.42 29.54 46.80 49.56 53.87
4 country/region Australia driving AU NaN NaN 100.0 102.98 104.21 108.63 ... 97.69 100.19 104.14 115.34 109.16 92.42 93.21 96.03 97.06 97.95

5 rows × 275 columns

Sample test: look up the walking mobility value for Connecticut on 2020-07-30.

# Spot-check the lookup helper with a single state, transport type and day.
get_mobility_by_state('walking', 'Connecticut', '2020-07-30')
1104.74
# Re-format each 'm/d/yy' entry of `dates` as a 'YYYY-MM-DD' string.
revised_dates = [
    datetime.datetime.strptime(raw_date, '%m/%d/%y').strftime('%Y-%m-%d')
    for raw_date in dates
]
def weekday_or_weekend(date):
    """Return True if *date* ('YYYY-MM-DD') is a Saturday or Sunday, else False."""
    parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
    # weekday() numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    return parsed.weekday() >= 5
# Integer day offsets 0..N-1, one per entry of revised_dates.
revised_day_since_1_22 = list(range(len(revised_dates)))
import matplotlib.dates as mdates
# States and territories whose walking mobility is plotted below.
# Each name appears once so that no chart is drawn twice.
states = [
    'New York', 'Connecticut', 'Florida', 'California', 'Texas', 'Georgia',
    'Arizona', 'Illinois', 'Louisiana', 'Ohio', 'Tennessee', 'North Carolina',
    'South Carolina', 'Alabama', 'Missouri', 'Kansas', 'Pennsylvania',
    'Wisconsin', 'Virginia', 'Massachusetts', 'Utah', 'Minnesota', 'Oklahoma',
    'Iowa', 'Arkansas', 'Kentucky', 'Puerto Rico', 'Colorado', 'New Jersey',
    'Idaho', 'Nevada', 'Maryland',
]
states.sort()

def _fill_zero_gaps(values):
    """Return a copy of *values* with each 0 replaced by a neighbouring value.

    A 0 at the first position takes the value that follows it (when there is
    one); any later 0 takes the already-filled value just before it.
    """
    filled = list(values)
    for position, value in enumerate(filled):
        if value != 0:
            continue
        if position > 0:
            filled[position] = filled[position - 1]
        elif len(filled) > 1:
            # Guarded so a one-element list does not raise IndexError.
            filled[position] = filled[1]
    return filled


# making sure the dates are in sync: only dates up to the last mobility
# column are used. list.index raises ValueError if that column name is not
# one of revised_dates.
mobility_latest_date = apple_mobility.columns[-1]
mobility_latest_index = revised_dates.index(mobility_latest_date)

for state in states:
    # weekend and weekday mobility are separated
    weekday_mobility = []
    weekday_mobility_dates = []
    weekend_mobility = []
    weekend_mobility_dates = []

    for day_index, day in enumerate(revised_dates[:mobility_latest_index + 1]):
        mobility = get_mobility_by_state('walking', state, day)
        if weekday_or_weekend(day):
            weekend_mobility.append(mobility)
            weekend_mobility_dates.append(day_index)
        else:
            weekday_mobility.append(mobility)
            weekday_mobility_dates.append(day_index)

    # remove null values (they are counted as 0)
    weekend_mobility = _fill_zero_gaps(weekend_mobility)
    weekday_mobility = _fill_zero_gaps(weekday_mobility)

    weekday_mobility_average = moving_average(weekday_mobility, 7)
    weekend_mobility_average = moving_average(weekend_mobility, 7)

    # Labels are attached to each artist so the legend does not depend on the
    # order in which matplotlib happens to collect lines and bar containers.
    plt.figure(figsize=(16, 10))
    plt.bar(weekday_mobility_dates, weekday_mobility, color='cornflowerblue',
            label='Weekday mobility')
    plt.plot(weekday_mobility_dates, weekday_mobility_average, color='green',
             label='Moving average (7 days) weekday mobility')

    plt.bar(weekend_mobility_dates, weekend_mobility, color='salmon',
            label='Weekend mobility')
    plt.plot(weekend_mobility_dates, weekend_mobility_average, color='black',
             label='Moving Average (7 days) weekend mobility')

    plt.legend(prop={'size': 25})
    plt.title('{} Walking Mobility Data'.format(state), size=25)
    plt.xlabel('Days since 1/22', size=25)
    plt.ylabel('Mobility Value', size=25)
    plt.xticks(size=25)
    plt.yticks(size=25)
    plt.show()